Read Data

Read data from Cluster Analysis Input Data.xlsx. From examining the file via Excel, we see that the data is located in the tab named “Database”. We call that tab in the read_xlsx function.

cluster_dat <- readxl::read_xlsx("Cluster Analysis Input Data.xlsx",sheet = 'Database')

Examine cluster_dat.

str(cluster_dat)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1825 obs. of  23 variables:
##  $ Time Bin         : POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 05:00:00" ...
##  $ X__1             : POSIXct, format: "2017-01-01" "2017-01-01" ...
##  $ X__2             : POSIXct, format: "2017-01-01" "2017-01-01" ...
##  $ X__3             : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 05:00:00" ...
##  $ X__4             : num  22 23 24 25 26 22 23 24 25 26 ...
##  $ Density          : num  2.25 2.64 8.93 9.7 4.33 ...
##  $ X__5             : num  17 18 19 20 21 17 18 19 20 21 ...
##  $ Volume           : num  68531 86294 239073 318088 136445 ...
##  $ X__6             : num  27 28 29 30 31 27 28 29 30 31 ...
##  $ Speed            : num  72.7 74.4 75.4 74.1 73.3 ...
##  $ Light Snow       : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Moderate Snow    : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Heavy Snow       : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Light Rain       : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Moderate Rain    : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Heavy Rain       : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Incident Count   : num  0 0 2 1 0 0 2 0 5 0 ...
##  $ Incident Duration: POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Incident Impact  : num  0 0 2.5 1.5 0 0 2 0 8.5 0 ...
##  $ Roadwork Count   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Roadwork Duration: POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
##  $ Roadwork Impact  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Week             : num  1 1 1 1 1 1 1 1 1 1 ...

First, we clean up the column headers by removing any potential trailing or leading spaces and replacing special characters in the strings with ’_’.

colnames(cluster_dat) <-trimws(colnames(cluster_dat))

colnames(cluster_dat) <-gsub('([[:punct:]])|\\s+','_',colnames(cluster_dat))

colnames(cluster_dat)
##  [1] "Time_Bin"          "X__1"              "X__2"             
##  [4] "X__3"              "X__4"              "Density"          
##  [7] "X__5"              "Volume"            "X__6"             
## [10] "Speed"             "Light_Snow"        "Moderate_Snow"    
## [13] "Heavy_Snow"        "Light_Rain"        "Moderate_Rain"    
## [16] "Heavy_Rain"        "Incident_Count"    "Incident_Duration"
## [19] "Incident_Impact"   "Roadwork_Count"    "Roadwork_Duration"
## [22] "Roadwork_Impact"   "Week"

DateTime

We look to fix the DateTime fields. The first five columns looks to be parsed versions of a DateTime. The ‘Time Bin’ looks to be the most complete DateTime variable. Examine further…

class(cluster_dat$Time_Bin)
## [1] "POSIXct" "POSIXt"
cluster_dat$Time_Bin <- as.POSIXct(as.character(cluster_dat$Time_Bin),tz="US/Central")
summary(cluster_dat$Time_Bin)
##                  Min.               1st Qu.                Median 
## "2017-01-01 00:00:00" "2017-04-02 05:00:00" "2017-07-02 10:00:00" 
##                  Mean               3rd Qu.                  Max. 
## "2017-07-02 09:56:52" "2017-10-01 14:00:00" "2017-12-31 19:00:00"
table(is.na(cluster_dat$Time_Bin))
## 
## FALSE 
##  1825

Everything seems to be in order. But we will keep an eye out for potential errors.

The last column is labelled week. Explore:

summary(cluster_dat$Week)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   14.00   27.00   26.57   40.00   53.00

Week 53 does not look right. Lets investigate further.

cluster_dat[cluster_dat$Week==53,c(1:5,length(cluster_dat))]

Week 53 represents the last day of the year. We will ignore this field.

Traffic Incidents

Incidents

summary(cluster_dat$Incident_Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   2.234   3.000  30.000
hist(cluster_dat$Incident_Count,breaks = 30)

Majority of days have no incidents.

Explore the content of Incident Impact and Incident Count.

summary(cluster_dat$Incident_Impact)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   2.877   4.000  38.000
par(mfrow=c(2,2))

hist(cluster_dat$Incident_Impact,breaks=seq(0,40,1),
     xlab='Incident_Impact')

plot(cluster_dat$Incident_Impact[cluster_dat$Incident_Count!=0],cluster_dat$Incident_Count[cluster_dat$Incident_Count!=0],
     xlab="Incident_Impact",ylab='Incident_Count')

plot(cluster_dat$Time_Bin[cluster_dat$Incident_Count!=0],cluster_dat$Incident_Count[cluster_dat$Incident_Count!=0],
     xlab="Time",ylab='Incident_Count')

plot(cluster_dat$Density[cluster_dat$Incident_Count!=0],cluster_dat$Incident_Count[cluster_dat$Incident_Count!=0],
     xlab="Density",ylab='Incident_Count')

It appears that the incident count measures the amount on incidents in a specific time bin. The incident impact tracks how many impacts happened in a specific time_bin. This will allow us to determine how bad each time_bin is in terms of incidents.

There are no easily detectable patterns when plotting incidents vs time but we do see a strong positive correlation between Density and Incidents counts. This makes sense as intuitively the more cars, the more incidents.

We will focus on Incident Duration as a variable. We will add a variable ‘Incident’ that is a total incident minutes for each time bin.

##create a total sum value for minutes of incidents in each time bin.
cluster_dat_incident_melt <- reshape2::melt(cluster_dat,id.vars="Time_Bin",measure.vars='Incident_Duration')%>%
  mutate(Hrs = as.numeric(format(value,"%H")),
         Mins = as.numeric(format(value,"%M")))%>%
  mutate(total = Hrs*60+Mins)%>%
  group_by(Time_Bin)%>%
  summarise(sum = sum(total))
  
cluster_dat$Incident <- cluster_dat_incident_melt$sum 

Roadwork

summary(cluster_dat$Roadwork_Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1118  0.0000  7.0000
table(cluster_dat$Roadwork_Count)
## 
##    0    1    2    3    4    5    7 
## 1689   97   24    7    4    3    1
par(mfrow=c(2,2))

hist(cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],xlab='Roadwork_Impact',main='Roadwork_Count')

plot(cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],cluster_dat$Roadwork_Impact[cluster_dat$Roadwork_Count!=0],
     xlab="Roadwork_Count",ylab='Roadwork_Impact')

plot(cluster_dat$Time_Bin[cluster_dat$Roadwork_Count!=0],cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],
     xlab="Time",ylab='Roadwork_Count')

plot(cluster_dat$Density[cluster_dat$Roadwork_Count!=0],cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],
     xlab="Density",ylab='Roadwork_Count')

x <- as.numeric(format(cluster_dat$Roadwork_Duration,"%H"))
x <- x[x>0]
table(x)
## x
##  1  2  3  4  5  6  7  8  9 10 11 22 
## 27 11 14  9  6  3  2  3  1  1  1  1

Majority of days have no Roadwork. One really bad day with 7. No obvious outliers.

Metrics

Lets look at the Density, Volume, Speed variables. Again there are columns without names mixed in between these fields. We will ignore the unnamed variables.

Date Time Fields

cluster_dat <- cluster_dat %>%
  mutate(Hour = format(Time_Bin,"%H"),
         Minute = format(Time_Bin,"%M"),
         Weekday = weekdays(cluster_dat$Time_Bin),
         Week = lubridate::week(cluster_dat$Time_Bin),
         Month = months(cluster_dat$Time_Bin),
         YearDay = yday(cluster_dat$Time_Bin))%>%
  filter(Hour == 14)%>%
  as.data.frame()




cluster_dat$Weekday <-  factor(cluster_dat$Weekday,levels=c("Sunday", "Monday", 
    "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
cluster_dat$Month <- factor(cluster_dat$Month,levels=c("January","February","March",
               "April","May","June","July","August","September",
               "October","November","December"),ordered=TRUE)

Density

summary(cluster_dat$Density)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.19   13.57   21.41   20.09   24.59   36.69
br_dens <- seq(0,50,2)
ranges_dens = paste(head(br_dens,-1), br_dens[-1], sep=" - ")
freq_dens <- hist(cluster_dat$Density, breaks=br_dens, include.lowest=TRUE, plot=FALSE)

freq_table_dens <-  data.frame(data.frame(range = ranges_dens, frequency = freq_dens$counts))
freq_table_dens[order(-freq_table_dens$frequency),]
plot(freq_dens)

The histogram is right skewed but no major outliers are present. There are also no 0 or NA values.

We now plot the Density vs Time.

ggplot(cluster_dat,aes(x=Weekday,y=Density,color=Weekday))+
  geom_violin()+
  geom_jitter(alpha=0.25)

Again, there do not seem to major outliers. There looks to be some patterns here.

The Density is obviously lower on weekends compared to weekdays.

Volume

summary(cluster_dat$Volume)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  296556  430715  530181  491952  547131  569488
br_vol <- seq(0,600000,20000)
ranges_vol = paste(head(br_vol,-1), br_vol[-1], sep=" - ")
freq_vol <- hist(cluster_dat$Volume, breaks=br_vol, include.lowest=TRUE, plot=FALSE)

freq_table_vol <-  data.frame(data.frame(range = ranges_vol, frequency = freq_vol$counts))
freq_table_vol[order(-freq_table_vol$frequency),]
plot(freq_vol)

It appears that the bin 40,000-60,000 contains the most records.

Again some interesting patterns but no major outliers.

We move on to speed….

Speed

We will explore the Speed variable.

summary(cluster_dat$Speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   36.79   61.92   65.84   66.01   71.86   75.11
br_speed <- seq(30,100,2)
ranges_speed = paste(head(br_speed,-1), br_speed[-1], sep=" - ")
freq_speed <- hist(cluster_dat$Speed, breaks=br_speed, include.lowest=TRUE, plot=FALSE)

freq_table_speed <-  data.frame(data.frame(range = ranges_speed, frequency = freq_speed$counts))
freq_table_speed[order(-freq_table_speed$frequency),]
plot(freq_speed)

The speed plot looks more normal in shape than the previous distributions but does appear to have a left skew. Again there are no major outliers to be worried about nor are there are any 0 or NAs.

Lets plot speed vs time

means <-  aggregate(Speed~Weekday,cluster_dat,mean)
ggplot(cluster_dat,aes(x=Weekday,y=Speed))+
  geom_violin(aes(color=Weekday))+
  geom_jitter(aes(color=Weekday),alpha=0.25)+
  stat_summary(fun.y=mean, colour="#990000", geom="point", 
               shape=18, size=3,show_guide = FALSE)+
  geom_text(data=means, aes(label = paste('Mean:\n',round(Speed,2),sep=''), y = Speed + 2.5),size=3,fontface='bold')+
  ylab('Speed mph')+
  theme(legend.position="none")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

means <-  aggregate(Speed~Month,cluster_dat,mean)
ggplot(cluster_dat,aes(x=Month,y=Speed))+
  geom_violin(aes(color=Month))+
  geom_jitter(aes(color=Month),alpha=0.25)+
  stat_summary(fun.y=mean, colour="#990000", geom="point", 
               shape=18, size=3,show_guide = FALSE)+
  geom_text(data=means, aes(label = paste('Mean:\n',round(Speed,2),sep=''), y = Speed + 2.5),color='darkred',size=3,fontface='bold')+
  ylab('Speed mph')+
  theme(legend.position="none")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

Looks like we have some reduced speeds in the Summer months, maybe due to construction activities. There are also some slower days in the winter months which could very easily be due to bad weather. But there are no noticeable patterns.

ggplot(cluster_dat,aes(x=Week,y=Speed,color=Weekday))+
  geom_point(aplha=0.25)+
  geom_smooth()
## Warning: Ignoring unknown parameters: aplha
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Again we are seeing a pattern with reduced speeds on weekends.

Lets look at public holidays:

hols_2017 <- as.POSIXct(c('2017-01-02 14:00','2017-05-29 14:00','2017-07-03 14:00','2017-07-04 14:00','2017-09-04 14:00','2017-11-23 14:00','2017-11-24 14:00','2017-12-25 14:00'))
table(cluster_dat$Time_Bin %in% hols_2017)
## 
## FALSE  TRUE 
##   357     8
cluster_dat$hols_2017 <- cluster_dat$Time_Bin %in% hols_2017

g <- ggplot(cluster_dat,aes(x=Weekday,y=Density))+
  geom_point(aes(alpha=0.25,color=hols_2017))
plotly::ggplotly(g)
s <- ggplot(cluster_dat,aes(x=Weekday,y=Speed))+
  geom_point(aes(alpha=0.25,color=hols_2017))
plotly::ggplotly(s)

Public holidays seem to be the weekdays with highest speeds.

Weather

We need to generate a total minute count for weather events in each time bin.

##create a total sum value for minutes of weather events in each time bin.
cluster_dat_melt_RS <- reshape2::melt(cluster_dat,id.vars="Time_Bin",measure.vars=c("Light_Rain","Light_Snow","Moderate_Rain","Moderate_Snow","Heavy_Snow","Heavy_Rain"))%>%
  mutate(Hrs = as.numeric(format(value,"%H")),
         Mins = as.numeric(format(value,"%M")),
         Percip = str_sub(variable,-4))%>%
  mutate(total = Hrs*60+Mins)%>%
  group_by(Time_Bin,Percip)%>%
  summarise(sum = sum(total))%>%
  dcast(Time_Bin~Percip,value.var = 'sum')
  
cluster_dat_melt_weather <- reshape2::melt(cluster_dat,id.vars="Time_Bin",measure.vars=colnames(cluster_dat)[11:16])%>%
  mutate(Hrs = as.numeric(format(value,"%H")),
         Mins = as.numeric(format(value,"%M")))%>%
  mutate(total = Hrs*60+Mins)%>%
  group_by(Time_Bin)%>%
  summarise(sum = sum(total))
  
cluster_dat$weather <- cluster_dat_melt_weather$sum

cluster_dat$Rain <- cluster_dat_melt_RS$Rain
cluster_dat$Snow <- cluster_dat_melt_RS$Snow

rm(cluster_dat_melt_RS,cluster_dat_melt_weather)

Now we filter out the weekends. We already determined that they will have their own cluster.

Filter Weekends

##filter out the weekends
'%ni%' <- Negate('%in%')
cluster_dat_midweek <- cluster_dat %>% 
  filter(Weekday %ni% c('Saturday','Sunday'))

## K Means Clustering {.tabset .tabset-fade}

Clustering

Day of Year vs Speed

nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,Rain,Snow,Incident)%>%scale(), min.nc=2, max.nc=15, method="kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 5 proposed 5 as the best number of clusters 
## * 3 proposed 6 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 2 proposed 11 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 2 proposed 14 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  5 
##  
##  
## *******************************************************************

We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.

set.seed(250)
#run kmeans
speed_cluster <-  kmeans(cluster_dat_midweek%>%select(Speed,YearDay,Rain,Snow,Incident)%>%scale(),centers = 5,nstart = 25)

##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)

We will look at some plots to display the clusters.

2D Plot Year Day ~ Speed

A standard 2d plot.

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,color=~cluster,type='scatter',mode='markers',text=~paste('Time_Bin:', format(Time_Bin,"%Y-%m-%d")))

The 2d plot does not do a good job of showing where the clusters split.

Year Day ~ Speed ~ Weather

set.seed(250)
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,weather)%>%scale(), min.nc=2, max.nc=15, method="kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 5 proposed 3 as the best number of clusters 
## * 7 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 7 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************

We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.

set.seed(250)
#run kmeans
speed_cluster <-  kmeans(cluster_dat_midweek%>%select(Speed,YearDay,weather)%>%scale(),centers = 5,nstart = 25)

##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)

We will look at some plots to display the clusters.

2D Plot Year Day ~ Speed

First a simple 2d plot

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,color=~cluster,type='scatter',mode='markers',text=~paste('Time_Bin:', format(Time_Bin,"%Y-%m-%d")))

The 2d plot does not do a good job of showing where the clusters split.

3D Plot Year Day ~ Speed ~ Weather

we will try a 3d plot with weather on the z axis.

#3d plot to better show

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~weather,
                color=~cluster, hoverinfo='text',
                text = ~paste('Speed:', round(Speed,2), '<br>Weather mins:', weather, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>% 
  add_markers()%>%
    layout(scene = list(xaxis = list(title = 'Day of Year',
                                     range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
                     yaxis = list(title = 'Speed MPH',
                                  range = c(min(cluster_dat_midweek$Speed),
                                            max(cluster_dat_midweek$Speed))),
                     zaxis = list(title = 'Weather mins',
                                  range = c(min(cluster_dat_midweek$weather),
                                            max(cluster_dat_midweek$weather)))))

Much better.

Year Day ~ Speed ~ Weather ~ Incident

set.seed(250)
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,weather, Incident)%>%scale(), min.nc=2, max.nc=15, method="kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 2 proposed 13 as the best number of clusters 
## * 5 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.

set.seed(250)
#run kmeans
speed_cluster <-  kmeans(cluster_dat_midweek%>%select(Speed,YearDay,weather, Incident)%>%scale(),centers = 5,nstart = 25)

##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)

We will look at some plots to display the clusters.

#3d plot to better show

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~weather,
                color=~cluster, hoverinfo='text',
                text = ~paste('Speed:', round(Speed,2), '<br>Weather mins:', weather, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>% 
  add_markers()%>%
    layout(scene = list(xaxis = list(title = 'Day of Year',
                                     range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
                     yaxis = list(title = 'Speed MPH',
                                  range = c(min(cluster_dat_midweek$Speed),
                                            max(cluster_dat_midweek$Speed))),
                     zaxis = list(title = 'Weather mins',
                                  range = c(min(cluster_dat_midweek$weather),
                                            max(cluster_dat_midweek$weather)))))

Much better.

3D Plot Year Day ~ Speed, Incidents

Now lets look at Incidents on the z axis.

#3d plot to better show

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~Incident,
                color=~cluster, hoverinfo='text',
                text = ~paste('Speed:', round(Speed,2), '<br>Incidents mins:', Incident, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>% 
  add_markers()%>%
    layout(scene = list(xaxis = list(title = 'Day of Year',
                                     range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
                     yaxis = list(title = 'Speed MPH',
                                  range = c(min(cluster_dat_midweek$Speed),
                                            max(cluster_dat_midweek$Speed))),
                     zaxis = list(title = 'Incident mins',
                                  range = c(min(cluster_dat_midweek$Incident),
                                            max(cluster_dat_midweek$Incident)))))

Year Day ~ Speed ~Rain ~ Snow ~ Incident

set.seed(250)
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,Rain, Snow, Incident)%>%scale(), min.nc=2, max.nc=15, method="kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 5 proposed 5 as the best number of clusters 
## * 3 proposed 6 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 2 proposed 11 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 2 proposed 14 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  5 
##  
##  
## *******************************************************************

We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.

set.seed(250)
#run kmeans
speed_cluster <-  kmeans(cluster_dat_midweek%>%select(Speed,YearDay,Rain,Snow, Incident)%>%scale(),centers = 5,nstart = 25)

##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)

We will look at some plots to display the clusters.

#3d plot to better show

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~Snow,
                color=~cluster, hoverinfo='text',
                text = ~paste('Speed:', round(Speed,2), '<br>Weather mins:', weather, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>% 
  add_markers()%>%
    layout(scene = list(xaxis = list(title = 'Day of Year',
                                     range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
                     yaxis = list(title = 'Speed MPH',
                                  range = c(min(cluster_dat_midweek$Speed),
                                            max(cluster_dat_midweek$Speed))),
                     zaxis = list(title = 'Snow mins',
                                  range = c(min(cluster_dat_midweek$Snow),
                                            max(cluster_dat_midweek$Snow)))))

Much better.

3D Plot Year Day ~ Speed, Incidents

Now lets look at Incidents on the z axis.

#3d plot to better show

plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~Incident,
                color=~cluster, hoverinfo='text',
                text = ~paste('Speed:', round(Speed,2), '<br>Incidents mins:', Incident, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>% 
  add_markers()%>%
    layout(scene = list(xaxis = list(title = 'Day of Year',
                                     range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
                     yaxis = list(title = 'Speed MPH',
                                  range = c(min(cluster_dat_midweek$Speed),
                                            max(cluster_dat_midweek$Speed))),
                     zaxis = list(title = 'Incident mins',
                                  range = c(min(cluster_dat_midweek$Incident),
                                            max(cluster_dat_midweek$Incident)))))